library(caret)
library(lme4)
library(lmerTest)
library(ggplot2)
library(stringi)
library(gridExtra)
library(ggfortify)
library(dendextend)
library(psych)
library(kableExtra)
library(tidyverse)
library(dtplyr)
source('other_functions.R')
source('plotting_functions.R')

This notebook presents isobaric labeling data analysis strategy that includes model-based normalization [data-driven normalization].

We will check how varying analysis components [unit/summarization/normalization/differential abundance testing methods] changes end results of a quantitative proteomic study.

data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format

# keep spectra with (isolation interference <=30 or NA) and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# specify # of varying component variants and their names
variant.names <- c('log2Intensity', 'Intensity', 'Ratio')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw') # ratios are considered raw, because they are basically mean-normalized intensities
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.colour <- tribble(
  ~Condition, ~Colour,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red' )
# create data frame with sample info (distinct Run,Channel, Condition, Colour)
sample.info <- get_sample_info(dat.l, condition.colour)

1 Unit component

dat.unit.l <- emptyList(variant.names)

1.1 log2 transformation of reporter ion intensities

dat.unit.l$log2Intensity <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)

1.2 original scale (not log2-transformed) of reporter ion intensities

dat.unit.l$Intensity <- dat.l %>% rename(response=Intensity)

1.3 intensity ratios (with respect to average within run)

Calculate ratio’s (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.

# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% distinct(Channel) %>% pull(Channel)
denom.df=dat.l %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='Intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$Ratio <- dat.l %>% left_join(denom.df[c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')], by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=Intensity/denom) %>% select(-c(Intensity, denom)) 

2 Summarization component

# no summarization 
dat.summ.l <- dat.unit.l

2.1 no summarization

3 Normalization component

3.1 mixed model spec 1

dat.norm.l <- dat.summ.l
# fit normalization model
norm.models <- lapply(dat.summ.l, function(x) return(lmer(response ~ Mixture + Mixture:TechRepMixture + Mixture:TechRepMixture:Channel + (1|Protein) + (1|Mixture:TechRepMixture:Peptide), data=x)))
# assign normalized values
for (i in seq_along(variant.names)){ dat.norm.l[[variant.names[i]]]$response <- residuals(norm.models[[variant.names[i]]]) }
# apply the 'fix' - add the model intercept to the residuals
dat.norm.l$IntensityFix <- dat.norm.l$Intensity; dat.norm.l$IntensityFix$response <- dat.norm.l$IntensityFix$response + fixef(norm.models$Intensity)['(Intercept)']
dat.norm.l$RatioFix <- dat.norm.l$Ratio; dat.norm.l$RatioFix$response <- dat.norm.l$RatioFix$response + fixef(norm.models$Ratio)['(Intercept)']
# update variant names, #, scale
variant.names <- names(dat.norm.l)
scale.vec <- c(scale.vec, 'raw', 'raw')
n.comp.variants <- length(variant.names)
tmp <- c("log2Intensity", "Intensity", "IntensityFix", "Ratio", "RatioFix")
variant.names <- tmp
dat.norm.l <- dat.norm.l[tmp]
# for technical reasons, assign new variants to dat.summ.l
dat.summ.l$IntensityFix <- dat.summ.l$Intensity
dat.summ.l$RatioFix <- dat.summ.l$Ratio
dat.summ.l <- dat.summ.l[tmp]

4 QC plots

# PSM data needs to be aggregated prior to PCA plots and HC plots (they require features in the intersection of all MS runs - this is not possible for PSM data). In order to be consisent, other normalization plots will be based on the aggregated data
dat.nonnorm.summ.l <- lapply(dat.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median')) # before normalization (you still need to aggregate the data because of 'dat.summ.l <- dat.unit.l' operation)
dat.nonnorm.summ.l <- lapply(dat.nonnorm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median'))
dat.norm.summ.l <- lapply(dat.norm.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein', 'Peptide'), 'median')) # after normalization
dat.norm.summ.l <- lapply(dat.norm.summ.l, function(x) aggFunc(x, 'response', group.vars=c('Mixture', 'TechRepMixture', 'Run', 'Channel', 'Condition', 'BioReplicate', 'Protein'), 'median')) # after normalization

# now create data sets in wide format
# before normalization
dat.nonnorm.summ.w <- lapply(dat.nonnorm.summ.l, function(x) {
  dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':') %>% column_to_rownames('Protein')
  return(dat.tmp) } )

# after normalization
dat.norm.summ.w <- lapply(dat.norm.summ.l, function(x) {
  dat.tmp <- pivot_wider(data=x, id_cols=Protein, names_from=Run:Channel, values_from=response, names_sep=':') %>% column_to_rownames('Protein')
  return(dat.tmp) } )

4.1 Boxplot:

par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
  boxplot_ils(dat.summ.l[[i]], paste('Before normalization', variant.names[i], sep='_'))
  boxplot_ils(dat.norm.l[[i]], paste('After normalization', variant.names[i], sep='_'))}

par(mfrow=c(1,1))

4.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# different unit variants require different computation of fold changes and average abundance: additive or multiplicative scale; see maplot_ils function 
for (i in 1: n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Before normalization', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('After normalization', variant.names[i], sep='_'), spiked.proteins)
  grid.arrange(p1, p2, ncol=2)}  

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# different unit variants require different computation of fold changes and average abundance: additive or multiplicative scale; see maplot_ils function 
samples.num <- sample.info %>% filter(Condition=='1') %>% distinct(Run:Channel) %>% pull
samples.denom <- sample.info %>% filter(Condition=='0.125') %>% distinct(Run:Channel) %>% pull
for (i in 1: n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w[[i]], samples.num, samples.denom, scale=scale.vec[i], paste('Before normalization', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w[[i]], samples.num, samples.denom, scale=scale.vec[i], paste('After normalization', variant.names[i], sep='_'), spiked.proteins)
  grid.arrange(p1, p2, ncol=2)}

4.3 CV (coefficient of variation) plots

par(mfrow=c(1,2))
for (i in 1: n.comp.variants){
  cvplot_ils(dat=dat.nonnorm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition', title=paste('Before normalization', variant.names[i], sep='_'), abs=T)
  cvplot_ils(dat=dat.norm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition', title=paste('After normalization', variant.names[i], sep='_'), abs=T)}

par(mfrow=c(1,1))

4.4 PCA plots

4.4.1 Using all proteins

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w)){
  if (variant.names[i] %in% c('Intensity', 'IntensityFix')) pcaplot_ils(dat.nonnorm.summ.w[[i]], info=sample.info, paste('Raw', variant.names[i], sep='_'), scale=T) else pcaplot_ils(dat.nonnorm.summ.w[[i]], info=sample.info, paste('Raw', variant.names[i], sep='_'))
  pcaplot_ils(dat.norm.summ.w[[i]], info=sample.info, paste('Normalized', variant.names[i], sep='_'))}

par(mfrow=c(1, 1))

4.4.2 Using spiked proteins only

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w)){
  if (variant.names[i] %in% c('Intensity', 'IntensityFix')) pcaplot_ils(dat.nonnorm.summ.w[[i]] %>% filter(rownames(dat.nonnorm.summ.w[[i]]) %in% spiked.proteins), info=sample.info, paste('Raw', variant.names[i], sep='_'), scale=T) else pcaplot_ils(dat.nonnorm.summ.w[[i]] %>% filter(rownames(dat.nonnorm.summ.w[[i]]) %in% spiked.proteins), info=sample.info, paste('Raw', variant.names[i], sep='_'))
  pcaplot_ils(dat.norm.summ.w[[i]] %>% filter(rownames(dat.nonnorm.summ.w[[i]]) %in% spiked.proteins), info=sample.info, paste('Normalized', variant.names[i], sep='_'))}

par(mfrow=c(1, 1))

HC (hierarchical clustering) plots

4.4.3 Using all proteins

par(mfrow=c(1,2))
for (i in 1: n.comp.variants){
  dendrogram_ils(dat.nonnorm.summ.w[[i]], info=sample.info, paste('Before normalization', variant.names[i], sep='_'))
  dendrogram_ils(dat.norm.summ.w[[i]], info=sample.info, paste('After normalization', variant.names[i], sep='_'))}

par(mfrow=c(1, 1))

4.5 P-values of One-way ANOVA (Run effect)

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.l)){
  hist_ils(run_test(dat.nonnorm.summ.l[[i]])$pvalue, main=paste('Raw', variant.names[i], sep='_'))
  hist_ils(run_test(dat.norm.summ.l[[i]])$pvalue, main=paste('Normalized', variant.names[i], sep='_'))}

par(mfrow=c(1, 1))  

5 DEA component

5.1 mixed model (intra-protein correlation) + eBayes

dat.dea <- emptyList(variant.names)
for(i in seq_along(dat.dea)){
  dat.dea[[i]] <- lmm_dea(dat=dat.norm.l[[i]], mod.formula='response ~ Condition + (1|Run:Channel)', scale=scale.vec[i], referenceCondition)
}

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

6 Results comparison

6.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
0.125 vs 0.5 contrast
log2Intensity
Intensity
IntensityFix
Ratio
RatioFix
background spiked background spiked background spiked background spiked background spiked
not_DE 4060 4 4015 13 4016 13 4060 5 4060 5
DE 3 15 48 6 48 6 4 14 4 14
0.125 vs 0.5 contrast
log2Intensity Intensity IntensityFix Ratio RatioFix
Accuracy 0.998 0.985 0.985 0.998 0.998
Sensitivity 0.789 0.316 0.316 0.737 0.737
Specificity 0.999 0.988 0.988 0.999 0.999
PPV 0.833 0.111 0.111 0.778 0.778
NPV 0.999 0.997 0.997 0.999 0.999
0.667 vs 0.5 contrast
log2Intensity
Intensity
IntensityFix
Ratio
RatioFix
background spiked background spiked background spiked background spiked background spiked
not_DE 4063 17 4020 19 4021 19 4062 15 4062 15
DE 0 2 43 0 43 0 2 4 2 4
0.667 vs 0.5 contrast
log2Intensity Intensity IntensityFix Ratio RatioFix
Accuracy 0.996 0.985 0.985 0.996 0.996
Sensitivity 0.105 0.000 0.000 0.211 0.211
Specificity 1.000 0.989 0.989 1.000 1.000
PPV 1.000 0.000 0.000 0.667 0.667
NPV 0.996 0.995 0.995 0.996 0.996
1 vs 0.5 contrast
log2Intensity
Intensity
IntensityFix
Ratio
RatioFix
background spiked background spiked background spiked background spiked background spiked
not_DE 4060 4 4061 10 4062 10 4059 4 4059 4
DE 3 15 2 9 2 9 5 15 5 15
1 vs 0.5 contrast
log2Intensity Intensity IntensityFix Ratio RatioFix
Accuracy 0.998 0.997 0.997 0.998 0.998
Sensitivity 0.789 0.474 0.474 0.789 0.789
Specificity 0.999 1.000 1.000 0.999 0.999
PPV 0.833 0.818 0.818 0.750 0.750
NPV 0.999 0.998 0.998 0.999 0.999

6.2 Scatter plots

scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)

scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)

6.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins) }

6.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)

7 Conclusions

8 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] dtplyr_1.0.1      forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [6] readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0   kableExtra_1.3.1 
## [11] psych_2.0.9       dendextend_1.14.0 ggfortify_0.4.11  gridExtra_2.3     stringi_1.5.3    
## [16] lmerTest_3.1-3    lme4_1.1-25       Matrix_1.2-18     caret_6.0-86      ggplot2_3.3.2    
## [21] lattice_0.20-41   rmarkdown_2.5    
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.1.10      plyr_1.8.6            splines_4.0.3        
##   [5] dplR_1.7.1            BiocParallel_1.22.0   TH.data_1.0-10        digest_0.6.26        
##   [9] foreach_1.5.1         htmltools_0.5.0       viridis_0.5.1         fansi_0.4.1          
##  [13] ROTS_1.16.0           magrittr_1.5          doParallel_1.0.16     limma_3.44.3         
##  [17] recipes_0.1.15        modelr_0.1.8          gower_0.2.2           matrixStats_0.57.0   
##  [21] R.utils_2.10.1        sandwich_3.0-0        colorspace_1.4-1      signal_0.7-6         
##  [25] rvest_0.3.6           JGmisc_0.3            haven_2.3.1           xfun_0.18            
##  [29] crayon_1.3.4          jsonlite_1.7.1        libcoin_1.0-6         impute_1.62.0        
##  [33] survival_3.2-7        zoo_1.8-8             iterators_1.0.13      glue_1.4.2           
##  [37] gtable_0.3.0          zlibbioc_1.34.0       ipred_0.9-9           webshot_0.5.2        
##  [41] NOMAD_0.99.0          BiocGenerics_0.34.0   scales_1.1.1          vsn_3.56.0           
##  [45] mvtnorm_1.1-1         DBI_1.1.0             Rcpp_1.0.5            mzR_2.22.0           
##  [49] viridisLite_0.3.0     tmvnsim_1.0-2         CONSTANd_0.99.0       preprocessCore_1.50.0
##  [53] matrixTests_0.1.9     stats4_4.0.3          lava_1.6.8.1          prodlim_2019.11.13   
##  [57] httr_1.4.2            modeltools_0.2-23     ellipsis_0.3.1        R.methodsS3_1.8.1    
##  [61] pkgconfig_2.0.3       XML_3.99-0.5          farver_2.0.3          nnet_7.3-14          
##  [65] dbplyr_2.0.0          utf8_1.1.4            tidyselect_1.1.0      labeling_0.4.2       
##  [69] rlang_0.4.8           reshape2_1.4.4        munsell_0.5.0         cellranger_1.1.0     
##  [73] tools_4.0.3           cli_2.1.0             generics_0.1.0        broom_0.7.2          
##  [77] evaluate_0.14         mzID_1.26.0           yaml_2.2.1            ModelMetrics_1.2.2.2 
##  [81] knitr_1.30            fs_1.5.0              ncdf4_1.17            coin_1.3-1           
##  [85] DEqMS_1.6.0           nlme_3.1-149          R.oo_1.24.0           xml2_1.3.2           
##  [89] compiler_4.0.3        rstudioapi_0.12       png_0.1-7             affyio_1.58.0        
##  [93] e1071_1.7-4           reprex_0.3.0          statmod_1.4.35        highr_0.8            
##  [97] MSnbase_2.14.2        ProtGenerics_1.20.0   nloptr_1.2.2.2        vctrs_0.3.4          
## [101] pillar_1.4.6          lifecycle_0.2.0       BiocManager_1.30.10   MALDIquant_1.19.3    
## [105] data.table_1.13.2     R6_2.5.0              pcaMethods_1.80.0     affy_1.66.0          
## [109] IRanges_2.22.2        codetools_0.2-16      boot_1.3-25           MASS_7.3-53          
## [113] assertthat_0.2.1      withr_2.3.0           mnormt_2.0.2          multcomp_1.4-14      
## [117] S4Vectors_0.26.1      mgcv_1.8-33           parallel_4.0.3        hms_0.5.3            
## [121] grid_4.0.3            rpart_4.1-15          timeDate_3043.102     class_7.3-17         
## [125] minqa_1.2.4           pROC_1.16.2           numDeriv_2016.8-1.1   Biobase_2.48.0       
## [129] lubridate_1.7.9